import HTMLParser
import urllib
import re


# Page that is downloaded and scanned for AS numbers.
urlString = "http://www.crimea-ix.net/content/view/13/30/"
# AS numbers (digit strings, without the "AS" prefix) collected by the parser;
# each number is stored only once.
urlText = []
# Output file; it is opened in append mode, so existing content is kept.
pathOut = "ixp_members.txt"


class Parse142(HTMLParser.HTMLParser):
    """Collect AS numbers found in ``<td class="tdc">`` table cells.

    Each number is appended, without its "AS" prefix and without
    duplicates, to the module-level ``urlText`` list.
    """

    # True from an opening <td class="tdc"> tag until the first text chunk
    # that follows it has been examined.
    check = False

    # A text chunk that consists solely of "AS<digits>", optionally
    # surrounded by whitespace; group 1 is the bare number.
    _AS_NUMBER_RE = re.compile(r"^\s*AS(\d+)\s*$")

    def handle_starttag(self, tag, attrs):
        """Flag the parser when a ``<td class="tdc">`` tag is opened.

        tag   -- name of the opened tag
        attrs -- list of (name, value) pairs for the tag's attributes
        """
        if tag != "td":
            return
        for name, value in attrs:
            if name == "class" and value == "tdc":
                self.check = True
                break

    def handle_data(self, data):
        """Record the AS number held by the first text chunk of a flagged cell.

        data -- text chunk passed in by the parser
        """
        if not self.check:
            return
        # Only the first chunk after the flagged tag is examined, whether or
        # not it turns out to be an AS number.
        self.check = False
        m = self._AS_NUMBER_RE.match(data)
        if m:
            number = m.group(1)
            if number not in urlText:
                urlText.append(number)


lparser = Parse142()
# The object returned by urllib.urlopen() is not a context manager in
# Python 2, so close it explicitly even when the read fails.
response = urllib.urlopen(urlString)
try:
    page = response.read()
finally:
    response.close()
# The page contains Cyrillic characters, so it has to be decoded before it is
# handed to the parser; bytes that cannot be decoded are dropped.
page = unicode(page, errors="ignore")
# Strip the <img> tags: the original author noted that two of them (HTML
# lines 415-419) make the parser stop.
page = re.sub("<img [^>]*>", "", page)
lparser.feed(page)
# One "142 <AS number>" line is appended per collected number; the with-block
# closes the file even if a write fails.
with open(pathOut, "a") as fileOut:
    for item in urlText:
        print >> fileOut, "142 %s" % item